{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Underfitting, overfitting, regularization, dropout"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load and minibatch MNIST data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fix for juliabox\n",
    "if ENV[\"HOME\"] == \"/mnt/juliabox\"; Pkg.dir(path...)=joinpath(\"/home/jrun/.julia/v0.6\",path...); end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "using Knet, Plots, Images\n",
    "plotly()    # use the plotly backend for Plots\n",
    "Knet.gpu()  # should return device-id >= 0 if there is a gpu"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load data, show some samples\n",
    "include(Knet.dir(\"data\",\"mnist.jl\"))\n",
    "xtrn,ytrn,xtst,ytst = mnist()\n",
    "for a in (xtrn,ytrn,xtst,ytst); println(summary(a)); end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show some samples\n",
    "for i=1:3; display(mnistview(xtst,i)); end\n",
    "ytst[1:3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Minibatch data\n",
    "Atype = gpu() >= 0 ? KnetArray{Float32} : Array{Float32}\n",
    "dtst = minibatch(xtst,ytst,100;xtype=Atype) # [ (x1,y1), (x2,y2), ... ] where xi,yi are minibatches of 100\n",
    "dtrn = minibatch(xtrn,ytrn,100;xtype=Atype) # [ (x1,y1), (x2,y2), ... ] where xi,yi are minibatches of 100\n",
    "length(dtrn),length(dtst)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Here is the first minibatch\n",
    "(x,y)=first(dtst)\n",
    "println(summary(x))  # 4-D Float32 array with X,Y,C,N\n",
    "println(summary(y))  # 1-D integer array"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Define linear model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "function linear(w,x)\n",
    "    y = w[1]*mat(x) .+ w[2]\n",
    "end\n",
    "\n",
    "winit1(;std=0.01)=map(Atype, [ std*randn(10,784), zeros(10,1) ])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "setseed(9)\n",
    "w = winit1()  # random weight matrix and a zero bias vector"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ypred = linear(w,x)\n",
    "summary(ypred) # predictions are given as a 10xN score matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "display(y')   # correct answers are given as an array of integers"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Measuring error rate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "accuracy(ypred,y)  # 2-arg version: accuracy on this batch of 100 with initial w"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "accuracy(w,dtst,linear)  # 3-arg version: accuracy on the whole test dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# zeroone loss (error) defined as 1 - accuracy\n",
    "zeroone(w,data,model) = 1 - accuracy(w,data,model)\n",
    "zeroone(w,dtst,linear)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Measuring loss"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculate cross entropy loss of a model with weights w for one minibatch (x,y)\n",
    "# Use the predict function to get model output: ypred = predict(w,x;o...)\n",
    "# Use non-zero l1 or l2 for regularization (only on matrices not biases)\n",
    "function softloss(w,x,y,predict;l1=0,l2=0,o...)\n",
    "    J = nll(predict(w,x;o...),y)\n",
    "    if l1 != 0; J += Float32(l1) * sum(sum(abs,wi)  for wi in w[1:2:end]); end\n",
    "    if l2 != 0; J += Float32(l2) * sum(sum(abs2,wi) for wi in w[1:2:end]); end\n",
    "    return J\n",
    "end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "softloss(w,x,y,linear)  # per-instance average softloss for the first test minibatch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "function avgloss(w,data,predict) # average loss for the whole dataset\n",
    "    sum = cnt = 0\n",
    "    for (x,y) in data\n",
    "        sum += softloss(w,x,y,predict)\n",
    "        cnt += 1\n",
    "    end\n",
    "    return sum/cnt\n",
    "end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "avgloss(w,dtst,linear)  # per-instance average softloss for the whole test set, should be close to -log(1/10)=2.3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Manual loss calculation\n",
    "ypred=linear(w,x)\n",
    "yp1 = exp.(ypred)\n",
    "yp2 = yp1 ./ sum(yp1,1)\n",
    "yp3 = -log.(yp2)\n",
    "yc1 = full(sparse(y,1:100,1f0))\n",
    "sum(Array(yp3).*yc1) / 100"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Calculating gradient"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Manually defined gradient for softloss\n",
    "function softgrad_manual(w,x,y,predict)\n",
    "    x = mat(x)\n",
    "    p = predict(w,x)\n",
    "    p = p .- maximum(p,1) # for numerical stability\n",
    "    expp = exp.(p)\n",
    "    p = expp ./ sum(expp,1)\n",
    "    q = oftype(p, sparse(convert(Vector{Int},y),1:length(y),1,size(p,1),length(y)))\n",
    "    dJdy = (p - q) / size(x,2)\n",
    "    dJdw = dJdy * x'\n",
    "    dJdb = sum(dJdy,2)\n",
    "    Any[dJdw,dJdb]\n",
    "end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Automatically defined gradient for softloss\n",
    "softgrad = grad(softloss)  # Knet/AutoGrad makes life easier :)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "setseed(9)\n",
    "w1 = winit1(std=0.1)  # use a larger std to get a larger gradient for this example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "g1 = softgrad_manual(w1,x,y,linear)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "g2 = softgrad(w1,x,y,linear)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "isapprox(g1[1],g2[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "isapprox(g1[2],g2[2])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Checking the gradient"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "display(g1[2]')  \n",
    "# Meaning of gradient:\n",
    "# If I move the last entry of w[2] by epsilon, loss will go up by 0.345075 epsilon!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "display(w1[2]')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "softloss(w1,x,y,linear)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "w1[2][10] = 0.1   # to numerically check the gradient let's move the last entry by +0.1.\n",
    "display(w1[2]')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "softloss(w1,x,y,linear)  \n",
    "# We see that the loss moves by +0.03 as expected.\n",
    "# You should check all/most entries in your gradients this way to make sure they are correct."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Training loop (SGD)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train model(w) with SGD and return a list containing w for every epoch\n",
    "function train(w,data,predict; epochs=100,lr=0.1,o...)\n",
    "    weights = Any[deepcopy(w)]\n",
    "    for epoch in 1:epochs\n",
    "        for (x,y) in data\n",
    "            g = softgrad(w,x,y,predict;o...)\n",
    "            update!(w,g,lr=lr)  # w[i] = w[i] - lr * g[i]\n",
    "        end\n",
    "        push!(weights,deepcopy(w))\n",
    "    end\n",
    "    return weights\n",
    "end"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Training the linear model and underfitting"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "setseed(1)\n",
    "@time trn1=train(winit1(),dtrn,linear,lr=0.1)  # 31.1s\n",
    "@time trnloss1 = [ avgloss(w,dtrn,linear) for w in trn1 ]  # 22.2s\n",
    "@time tstloss1 = [ avgloss(w,dtst,linear) for w in trn1 ]  # 3.7s\n",
    "@time trnerr1 = [ zeroone(w,dtrn,linear) for w in trn1 ]   # 20.6s\n",
    "@time tsterr1 = [ zeroone(w,dtst,linear) for w in trn1 ]   # 3.4s\n",
    "minimum(tstloss1),minimum(tsterr1)  # 0.2667, 0.0744"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plot([trnloss1 tstloss1],ylim=(.2,.36),labels=[:trnloss :tstloss],xlabel=\"Epochs\",ylabel=\"Loss\") \n",
    "# Demonstrates underfitting: training loss not close to 0\n",
    "# Also slight overfitting: test loss higher than train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plot([trnerr1 tsterr1],ylim=(.06,.10),labels=[:trnerr :tsterr],xlabel=\"Epochs\",ylabel=\"Error\")  \n",
    "# this is the error plot, we get to about 7.5% test error, i.e. 92.5% accuracy"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Multi-layer linear model does not improve results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Let us try to concatenate multiple linear layers\n",
    "function multilinear(w,x)\n",
    "    for i=1:2:length(w)\n",
    "        x = w[i]*mat(x) .+ w[i+1]\n",
    "    end\n",
    "    return x\n",
    "end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Weight initialization for multiple layers: x=input size, y=output size, h=array of hidden layer sizes\n",
    "# Output is an array [w0,b0,w1,b1,...,wn,bn] where wi,bi is the weight matrix and bias vector for the i'th layer\n",
    "function winit(h...; std=0.01, x=784, y=10)  # use winit(h1,h2,...,hn) for n hidden layer model\n",
    "    h = [x, h..., y]\n",
    "    w = Any[]\n",
    "    for i=1:length(h)-1\n",
    "        push!(w, std*randn(h[i+1],h[i]))\n",
    "        push!(w, zeros(h[i+1],1))\n",
    "    end\n",
    "    map(Atype, w)\n",
    "end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "w1m=winit(64) # gives weights and biases for a multi layer model with a single hidden layer of size 64"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "softloss(w1m,x,y,multilinear)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "setseed(1)\n",
    "@time trn1m=train(winit(64),dtrn,multilinear,lr=0.1)  # 33.9s\n",
    "#@time trn1m=train(w2,dtrn,multilinear,lr=0.01,epochs=10)  # 33.9s\n",
    "@time trnloss1m = [ avgloss(w,dtrn,multilinear) for w in trn1m ]  # 22.2s\n",
    "@time tstloss1m = [ avgloss(w,dtst,multilinear) for w in trn1m ]  # 3.73s\n",
    "@time trnerr1m = [ zeroone(w,dtrn,multilinear) for w in trn1m ]   # 22.8s\n",
    "@time tsterr1m = [ zeroone(w,dtst,multilinear) for w in trn1m ]   # 3.84s\n",
    "minimum(tstloss1m),minimum(tsterr1m)  # 0.285, 0.0797"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plot([trnloss1 tstloss1 trnloss1m tstloss1m],ylim=(0.2,0.4),\n",
    "    labels=[:trnLin :tstLin :trnMulti :tstMulti],xlabel=\"Epochs\",ylabel=\"Loss\")  \n",
    "# multilinear converges to a similar solution, not identical because problem is non-convex"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plot([trnerr1 tsterr1 trnerr1m tsterr1m],ylim=(0.06,0.12),\n",
    "    labels=[:trnLin :tstLin :trnMulti :tstMulti],xlabel=\"Epochs\",ylabel=\"Error\")  \n",
    "# error results also close to the linear model"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Multiple linear layers are useless because they are equivalent to a single linear layer\n",
    "If we write down what is being computed and do some algebra, we can show that what is being computed is still an affine function of the input, i.e. stacking multiple linear layers does not increase the representational capacity of the model:\n",
    "\\begin{align*}\n",
    "\\hat{p} &= \\mbox{soft}(W_2 (W_1 x + b_1) + b_2) \\\\\n",
    "&= \\mbox{soft}((W_2 W_1)\\, x + W_2 b_1 + b_2) \\\\\n",
    "&= \\mbox{soft}(W x + b)\n",
    "\\end{align*}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Multi Layer Perceptron (MLP) adds non-linearities between layers, shows overfitting"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Using nonlinearities (relu) results in a model with higher capacity which helps underfitting\n",
    "function mlp(w,x)\n",
    "    for i=1:2:length(w)\n",
    "        x = w[i]*mat(x) .+ w[i+1]\n",
    "        if i < length(w)-1  # Apply element-wise non-linearity after every layer except the last\n",
    "            x = relu.(x)    # relu here is the only difference between this and multilinear\n",
    "        end    \n",
    "    end\n",
    "    return x\n",
    "end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "wmlp=winit(64) # gives weights and biases for an MLP with a single hidden layer of size 64"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "softloss(wmlp,x,y,mlp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "setseed(1)\n",
    "@time trn3=train(winit(64),dtrn,mlp,lr=0.1)  # 35.4s\n",
    "@time trnloss3 = [ avgloss(w,dtrn,mlp) for w in trn3 ]  # 23.7s\n",
    "@time tstloss3 = [ avgloss(w,dtst,mlp) for w in trn3 ]  # 3.99s\n",
    "@time trnerr3 = [ zeroone(w,dtrn,mlp) for w in trn3 ]   # 23.3s\n",
    "@time tsterr3 = [ zeroone(w,dtst,mlp) for w in trn3 ]   # 3.91s\n",
    "minimum(tstloss3),minimum(tsterr3)  # 0.0887, 0.0234"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plot([trnloss1 tstloss1 trnloss3 tstloss3],ylim=(0.0,0.4),\n",
    "    labels=[:trnLin :tstLin :trnMLP :tstMLP],xlabel=\"Epochs\",ylabel=\"Loss\")  \n",
    "# Solves the underfitting problem!\n",
    "# A more serious overfitting problem remains."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plot([trnerr1 tsterr1 trnerr3 tsterr3],ylim=(0,0.1),\n",
    "    labels=[:trnLin :tstLin :trnMLP :tstMLP],xlabel=\"Epochs\",ylabel=\"Error\")  \n",
    "# error improves from 7.5% to 2.3%!"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## MLP with L1 regularization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# We still have overfitting, let's try L1 regularization\n",
    "srand(1)\n",
    "@time trn4=train(winit(64),dtrn,mlp;lr=0.1,l1=0.00004)  # 47.3s\n",
    "@time trnloss4 = [ avgloss(w,dtrn,mlp) for w in trn4 ]  # 24.8s\n",
    "@time tstloss4 = [ avgloss(w,dtst,mlp) for w in trn4 ]  # 4.17s\n",
    "@time trnerr4 = [ zeroone(w,dtrn,mlp) for w in trn4 ]   # 23.7s\n",
    "@time tsterr4 = [ zeroone(w,dtst,mlp) for w in trn4 ]   # 3.95s\n",
    "minimum(tstloss4),minimum(tsterr4)  # 0.0791, 0.0228"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plot([trnloss3 tstloss3 trnloss4 tstloss4],ylim=(0,0.15),\n",
    "    labels=[:trnMLP :tstMLP :trnMLP_L1 :tstMLP_L1],xlabel=\"Epochs\", ylabel=\"Loss\")  \n",
    "# overfitting less, test loss improves from 0.0887 to 0.0791"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plot([trnerr3 tsterr3 trnerr4 tsterr4],ylim=(0,0.04),\n",
    "    labels=[:trnMLP :tstMLP :trnMLP_L1 :tstMLP_L1],xlabel=\"Epochs\", ylabel=\"Error\")    \n",
    "# however test error does not change significantly: 0.0234 -> 0.0228"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    ":mlperr,minimum(tsterr3),:l1err,minimum(tsterr4)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## MLP with dropout"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dropout is another way to address overfitting\n",
    "function mlpdrop(w,x; pdrop=(0,0))\n",
    "    for i=1:2:length(w)\n",
    "        x = dropout(x, pdrop[i==1?1:2])  # apply one of two dropout rates\n",
    "        x = w[i]*mat(x) .+ w[i+1]\n",
    "        if i < length(w)-1; x = relu.(x); end\n",
    "    end\n",
    "    return x\n",
    "end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "setseed(1)\n",
    "@time trn5=train(winit(64),dtrn,mlpdrop;lr=0.1,pdrop=(0.2,0))  # 38.9s\n",
    "@time trnloss5 = [ avgloss(w,dtrn,mlpdrop) for w in trn5 ]     # 25.7s\n",
    "@time tstloss5 = [ avgloss(w,dtst,mlpdrop) for w in trn5 ]     # 4.25s\n",
    "@time trnerr5 = [ zeroone(w,dtrn,mlpdrop) for w in trn5 ]      # 24.3s\n",
    "@time tsterr5 = [ zeroone(w,dtst,mlpdrop) for w in trn5 ]      # 4.11s\n",
    "minimum(tstloss5),minimum(tsterr5)  # 0.0645, 0.0186"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plot([trnloss3 tstloss3 trnloss5 tstloss5],ylim=(0,0.15),\n",
    "    labels=[:trnMLP :tstMLP :trnDropout :tstDropout],xlabel=\"Epochs\", ylabel=\"Loss\")\n",
    "# overfitting less, loss results improve 0.0887 -> 0.0645"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plot([trnerr3 tsterr3 trnerr5 tsterr5],ylim=(0,0.04),\n",
    "    labels=[:trnMLP :tstMLP :trnDropout :tstDropout],xlabel=\"Epochs\", ylabel=\"Error\")  \n",
    "# this time error also improves 0.0234 -> 0.0186"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    ":mlperr,minimum(tsterr3),:l1err,minimum(tsterr4),:dropouterr,minimum(tsterr5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    ":mlploss,minimum(tstloss3),:l1loss,minimum(tstloss4),:dropoutloss,minimum(tstloss5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## MLP with larger hidden layer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The current trend is to use models with higher capacity tempered with dropout\n",
    "setseed(1)\n",
    "@time trn6=train(winit(256),dtrn,mlpdrop;lr=0.1,pdrop=(0.2,0))  # 34.6s\n",
    "@time trnloss6 = [ avgloss(w,dtrn,mlpdrop) for w in trn6 ]      # 21.2s\n",
    "@time tstloss6 = [ avgloss(w,dtst,mlpdrop) for w in trn6 ]      # 3.61s\n",
    "@time trnerr6 = [ zeroone(w,dtrn,mlpdrop) for w in trn6 ]       # 21.7s\n",
    "@time tsterr6 = [ zeroone(w,dtst,mlpdrop) for w in trn6 ]       # 3.63s\n",
    "minimum(tstloss6),minimum(tsterr6)  # 0.0473, 0.0147"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plot([trnloss5 tstloss5 trnloss6 tstloss6],ylim=(0,0.15),\n",
    "    labels=[:trn64 :tst64 :trn256 :tst256],xlabel=\"Epochs\",ylabel=\"Loss\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plot([trnerr5 tsterr5 trnerr6 tsterr6],ylim=(0,0.04),\n",
    "    labels=[:trn64 :tst64 :trn256 :tst256],xlabel=\"Epochs\",ylabel=\"Error\")\n",
    "# We are down to 0.015 error."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Julia 0.6.2",
   "language": "julia",
   "name": "julia-0.6"
  },
  "language_info": {
   "file_extension": ".jl",
   "mimetype": "application/julia",
   "name": "julia",
   "version": "0.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
